This report is a guidebook with selected Q&A aims at finding out what makes women happy.
The whole dataset: HappyDB is a corpus of 100,000 crowd-sourced happy moments via Amazon’s Mechanical Turk. You can read more about it on https://arxiv.org/abs/1801.07746. But I only use a subset of this dataset to conduct data analysis in this report.
library(tidyverse)
library(tidytext)
library(DT)
library(scales)
library(wordcloud)
library(gridExtra)
library(ngram)
library(shiny)
library(tm)
urlfile<-'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv'
hm_data <- read_csv(urlfile)
corpus <- VCorpus(VectorSource(hm_data$cleaned_hm))%>%
tm_map(content_transformer(tolower))%>%
tm_map(removePunctuation)%>%
tm_map(removeNumbers)%>%
tm_map(removeWords, character(0))%>%
tm_map(stripWhitespace)
stemmed <- tm_map(corpus, stemDocument) %>%
tidy() %>%
select(text)
dict <- tidy(corpus) %>%
select(text) %>%
unnest_tokens(dictionary, text)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
data("stop_words")
word <- c("happy","ago","yesterday","lot","today","months","month",
"happier","happiest","last","week","past")
stop_words <- stop_words %>%
bind_rows(mutate(tibble(word), lexicon = "updated"))
completed <- stemmed %>%
mutate(id = row_number()) %>%
unnest_tokens(stems, text) %>%
bind_cols(dict) %>%
anti_join(stop_words, by = c("dictionary" = "word"))
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
completed <- completed %>%
group_by(stems) %>%
count(dictionary) %>%
mutate(word = dictionary[which.max(n)]) %>%
ungroup() %>%
select(stems, word) %>%
distinct() %>%
right_join(completed) %>%
select(-stems)
completed <- completed %>%
group_by(id) %>%
summarise(text = str_c(word, collapse = " ")) %>%
ungroup()
hm_data <- hm_data %>%
mutate(id = row_number()) %>%
inner_join(completed)
datatable(hm_data)